{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing of Project-level TCGA CNA Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import numpy.matlib\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import mygene\n",
    "import h5py\n",
    "import os, sys, math\n",
    "import umap\n",
    "from functools import reduce\n",
    "\n",
    "sys.path.append(os.path.abspath('../preprocessing/'))\n",
    "import preprocessing_utils as utils\n",
    "\n",
    "bestSplit = lambda x: (round(math.sqrt(x)), math.ceil(x / round(math.sqrt(x))))\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "NORMALIZE_FOR_GENE_LENGTH = True\n",
    "\n",
    "gencode_annotation_path = '../../data/pancancer/gencode/gencode.v28.basic.annotation.gff3'\n",
    "cna_base_dir = '../../data/pancancer/TCGA/mutation/CNA/'\n",
    "ultra_mutated_samples_path = '../../data/pancancer/TCGA/mutation/ultramutated_tumor_ids'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Compute Exonic Gene Length for all Genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1gAAAHwCAYAAAC7T84CAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAABBqUlEQVR4nO3de5hdZXn///fHIJEYCCKniJQRiiglGCU/TxWLQj0UW08oiaihtY22asWvJzy0Tb+VimdKsVqqCFolHpBqAWkRTAn9ojKRQAigiERLFDkoAQwqhvv3x15Tt+Mc9mTWZE8y79d17WvWek7rXntdDNw8z3omVYUkSZIkafIe0O8AJEmSJGl7YYIlSZIkSS0xwZIkSZKklphgSZIkSVJLTLAkSZIkqSUmWJIkSZLUEhMsSZK2QUk+kuSv+h2HJOnXmWBJksaUZHaSjyX5XpK7k6xJ8uxhbY5Mcn2STUm+mmS/McZbn+SoqY98cpIckeTmcdqcmeQXSe5J8uMkFyV51BZeb2WSP+21fVW9qqr+bkuuJUmaOiZYkqTx7AD8D/B7wDzgHcBnkwwAJNkd+ALwV8BuwCDwmb5E2h/vqaq5wMOBW4EzhzdIh//OlaQZwF/2kqQxVdVPq2p5Va2vqvur6jzgJuCwpskLgHVV9bmq+hmwHHjMRGdympmyU5L8oPmckmR2U3dEkpuTvCHJrUl+mOSPu/o+NMm/J7kryRVJ3pnksq76RzWzSz9O8q0kL+6q+4Mk1zazcxuSvDHJg4EvAw9rZqfuSfKwcb6nTcCngUOacVcmOSnJfwObgP2TPLmJb2Pz88lN25OAw4HTmmud1kPcZyZ5Z4/fz2/c40SejSSpdyZYkqQJSbIX8EhgXVP0O8BVQ/VV9VPgxqZ8It4OPBFYCDwGeDyd2bIhe9OZQdsHeAXwoSQPaeo+BPy0abO0+QzF+2DgIjrJz57AYuCfkhzcNPkY8Mqq2plOcnRJcw/PBn5QVXObzw/GCj7JXOA44Mqu4pcBy4CdgbuB84FTgYcCHwDOT/LQqno7sAp4TXOt1/QQ93BjfT+/cY9j3YskacuZYEmSepbkgcCngLOq6vqmeC6wcVjTjXSSiok4Dvi/VXVrVd0G/C2dBGXIfU39fVV1AXAPcFCSWcALgb+pqk1VdS1wVle/5wDrq+rjVfXLqroSOAd4Ude4ByfZpap+UlXfnGDcb0xyJ/AdOt/F8V11Z1bVuqr6JfAM4Iaq+mQTx9nA9cAfjjLueHEPN+L309I9SpJ6ZIIlSepJ8w7RJ4FfAK/pqroH2GVY813ozNhMxMOA73Wdf68pG3JHk6gM2UQnodmDX70nNqT7eD/gCUnuHPrQSeb2bupfCPwB8L0k/5XkSROM+31VtWtV7V1Vf1RVN44Sx/D7oznfZ5Rxx4t7uNG+H5j8PUqSemSCJUkaV5LQWWa2F/DCqrqvq3odnSV9Q20fDBzAr5YQ9uoHdJKKIb/VlI3nNuCXdDaZGLJv1/H/AP/VJEFDn7lV9ecAVXVFVT2XzjK8fwM+2/SrCcY/ku4xht8fdO5xwyjXGzPuCQUx+j1KklpmgiVJ6sWHgUcDf1hV9w6rOxc4JMkLkzwI+Gvg6q4lhCN5YJIHdX12AM4G3pFkj2Znwr8G/nW8wKpqM51dDJcnmdNsrvHyribnAY9M8rIkD2w+/1+SRyfZMclxSeY1SeNdwP1Nvx8BD00yb7wYenRBE8dLkuyQ5Fjg4Ca+oevt30vcE7noOPcoSWqZCZYkaUzp/E2rV9LZfOKWrl31jgNo3pd6IXAS8BPgCXQ2ZBjLBcC9XZ/lwDvpbPF+NbAW+GZT1ovX0Nng4RY6yxjPBn7exHc3nfefFtOZRboFeDcwu+n7MmB9kruAV9FZhkeTIJ4NfLdZojfmLoLjqao76LxX9QbgDuDNwHOq6vamyT8AxyT5SZJTe4h7Ika8R0lS+1LVxgoISZKmjyTvBvauqqXjNpYkqUXOYEmStnnN34s6NB2Pp7NN+bn9jkuSNPPs0O8AJElqwc50lvM9jM67TO8HvtjXiCRJM5JLBCVJkiSpJS4RlCRJkqSWmGBJkiRJUktm9DtYu+++ew0MDPQ7DEmSJEnT1OrVq2+vqj16bT+jE6yBgQEGBwf7HYYkSZKkaSrJ9ybS3iWCkiRJktQSEyxJkiRJaokJliRJkiS1xARLkiRJklpigiVJkiRJLTHBkiRJkqSWmGBJkiRJUktMsCRJkiSpJSZYkiRJktQSEyxJkiRJaokJliRJkiS1xARLkiRJklpigiVJkiRJLTHBkiRJkqSWmGBJkiRJUktMsCRJkiSpJSZYkiRJktQSEyxJkiRJaskO/Q6gn9Zu2MjAief3Owy1YP3JR/c7BEmSJMkZLEmSJElqiwmWJEmSJLXEBEuSJEmSWjIt38FKshlYCzwQ+CXwCeCDVXV/kjnAvwCHAgHuBI4Dvth03xvYDNzWnD++qn6x9aKXJEmSNFNNywQLuLeqFgIk2RP4NLAL8DfA64AfVdWCpv4g4Jau9suBe6rqfVs/bEmSJEkz2bRfIlhVtwLLgNckCTAf2NBV/62q+nm/4pMkSZKkIdM+wQKoqu8Cs4A9gTOAtyS5PMk7kxzY3+gkSZIkqWObSLC6VdUaYH/gvcBuwBVJHt1r/yTLkgwmGdy8aeMURSlJkiRpJpqu72D9miT709m44laAqroH+ALwhST3A38AXNfLWFV1OnA6wOz5B9aUBCxJkiRpRpr2M1hJ9gA+ApxWVZXkd5M8pKnbETgY+F4/Y5QkSZIkmL4zWDslWcOvtmn/JPCBpu4A4MPNhhcPAM4HzulHkJIkSZLUbVomWFU1a4y6T9D5u1ij1S+fipgkSZIkaTzTfomgJEmSJG0rTLAkSZIkqSUmWJIkSZLUkmn5DtbWsmCfeQyefHS/w5AkSZK0nXAGS5IkSZJaYoIlSZIkSS0xwZIkSZKklszod7DWbtjIwInn9zsMTYH1vlsnSZKkPnAGS5IkSZJaYoIlSZIkSS0xwZIkSZKklvQtwUry/CRrhn3uT/LnSe4dVv7yps/6JGubsrVJntuU75vkq0muTbIuyev6dV+SJEmSZq6+bXJRVecC5w6dJ1kGHAf8B3BjVS0cpevTqur2JAcB/wl8Efgl8Iaq+maSnYHVSS6qqmun9CYkSZIkqcu02EUwySOBvwaeTO+zarsAPwGoqh8CP2yO705yHbAPYIIlSZIkaavpe4KV5IHAp+nMQH0/yQBwQJI1Xc1eW1WrmuOvJgmwP/DiEcYbAB4LfH0q45YkSZKk4fqeYAF/B6yrqs90lfWyRPAA4OIkK6vqHoAkc4FzgBOq6q6ROjdLEZcBzNplj7buQZIkSZL6u4tgkiOAFwKvmWjfqroR+BFwcDPWA+kkV5+qqi+M0e/0qlpUVYtmzZm3JWFLkiRJ0oj6NoOV5CHAx4GXVNXdW9B/T+ARwPeaJYMfA66rqg+0G6kkSZIk9aafSwRfBewJfLiTH/2vs/nNd7DOqKpTm+OvJtkMPBA4sap+lOQpwMuAtV393lZVF0zlDUiSJElSt35u0/4u4F2jVL97lD4Do5RfBmSkOkmSJEnaWvr6DpYkSZIkbU9MsCRJkiSpJSZYkiRJktSS6fB3sPpmwT7zGDz56H6HIUmSJGk74QyWJEmSJLXEBEuSJEmSWmKCJUmSJEktmdHvYK3dsJGBE8/vdxjqg/W+eydJkqQp4AyWJEmSJLXEBEuSJEmSWmKCJUmSJEktmdIEK0kl+deu8x2S3JbkvGHt/i3J10bo/8Yk1ydZk+SKJC9vylcm+VZTvibJMU35GUluTXLNVN6XJEmSJI1kqmewfgockmSn5vz3gQ3dDZLsChwGzEuyf1f5q5r2j6+qhcCRQLq6HldVC5vP55uyM4FnTcF9SJIkSdK4tsYSwQuAoS3blgBnD6t/AfDvwApgcVf524A/r6q7AKrqrqo6a6wLVdWlwI/bCFqSJEmSJmprJFgrgMVJHgQcCnx9WP1Q0nV2c0ySXYCdq+q7Y4z7qa4lgg+dgrglSZIkaUKm/O9gVdXVSQboJE8XdNcl2Qs4ELisqirJfUkOAb7fw9DHVdXgRONJsgxYBjBrlz0m2l2SJEmSRrW1dhH8EvA+fnN54IuBhwA3JVkPDABLmmWB93S/k9WWqjq9qhZV1aJZc+a1PbwkSZKkGWxrJVhnAH9bVWuHlS8BnlVVA1U1QGezi6H3sN4FfKhZLkiSuUO7CEqSJEnSdLRVEqyqurmqTu0ua5YN7gd8ravdTcDGJE8APgx8Fbii2XZ9FXD/WNdJcjZwOXBQkpuTvKLVG5EkSZKkMUzpO1hVNXeEspXAyuZ0nxHqH9d1+p7mM7zNEaNcb8kWhClJkiRJrdhaSwQlSZIkabtngiVJkiRJLTHBkiRJkqSWTPnfwZrOFuwzj8GTj+53GJIkSZK2E85gSZIkSVJLTLAkSZIkqSUmWJIkSZLUkhn9DtbaDRsZOPH8foehPlnv+3eSJElqmTNYkiRJktQSEyxJkiRJaokJliRJkiS1pPUEK8nbk6xLcnWSNUmekGRlkkUjtH18kkuTfCvJlUk+mmROU3dE039dkv8a1u95SSrJo4aVX5jkziTntX1fkiRJkjSeVje5SPIk4DnA46rq50l2B3Ycpe1ewOeAxVV1eVN2DLBzkh2BfwKeVVXfT7LnsO5LgMuan3/TVf5eYA7wyhZvS5IkSZJ60vYM1nzg9qr6OUBV3V5VPxil7auBs4aSq6b956vqR8BLgC9U1feb8luH2iSZCzwFeAWwuHvAqroYuLvF+5EkSZKknrWdYP0nsG+Sbyf5pyS/N0bbQ4DVo9Q9EnhIs7RwdZKXd9U9F7iwqr4N3JHksHZClyRJkqTJaTXBqqp7gMOAZcBtwGeSHL8FQ+3QjHM08Ezgr5I8sqlbAqxojlc05z1LsizJYJLBzZs2bkFokiRJkjSy1v/QcFVtBlYCK5OsBZaO0nQdnSTqiyPU3QzcUVU/BX6a5FLgMUluB54OLEhSwCygkrypqqrH+E4HTgeYPf/AnvpIkiRJUi9ancFKclCSA7uKFgLfG6X5acDSJE/o6v+CZvOLLwJPSbJDs6vgE4DrgGOAT1bVflU1UFX7AjcBh7d5H5IkSZK0Jdp+B2sucFaSa5NcDRwMLG/qzk9yc/P5XLOZxWLgfc027dfRWQ54d1VdB1wIXA18A/hoVV1DZzngucOueU5TTpJVdHYmPLK5zjNbvj9JkiRJGlV6XFm3XZo9/8Cav/SUfoehPll/8tH9DkGSJEnTXJLVVfUbf9N3NK3/oWFJkiRJmqlMsCRJkiSpJSZYkiRJktSS1rdp35Ys2Gceg76HI0mSJKklzmBJkiRJUktMsCRJkiSpJSZYkiRJktSSGf0O1toNGxk48fx+h6E+8m9hSZIkqU3OYEmSJElSS0ywJEmSJKklJliSJEmS1JKtkmAleXuSdUmuTrImyZeTvLurfr8k302ya5KVSQa76hYlWdkcH5FkYzPG0Oeopu6MJLcmuWZr3JMkSZIkDTflm1wkeRLwHOBxVfXzJLsDs4FLkpxZVdcB/wD8VVXdmQRgzyTPrqovjzDkqqp6zgjlZwKnAZ+YkhuRJEmSpHFsjRms+cDtVfVzgKq6vao2AK8HPpTkD4Cdq+pTXX3eC7x9IhepqkuBH7cUsyRJkiRN2NZIsP4T2DfJt5P8U5LfA6iqC4CfAGcBfzGsz+XAL5I8bYTxDh+2RPCAKY1ekiRJkno05QlWVd0DHAYsA24DPpPk+Kb6Q8AVVfWtEbq+E3jHCOWrqmph1+fGicSTZFmSwSSDmzdtnEhXSZIkSRrTVtnkoqo2V9XKqvob4DXAC5uq+5vPSH0uAXYCnthyLKdX1aKqWjRrzrw2h5YkSZI0w015gpXkoCQHdhUtBL7XY/d3Am9uPShJkiRJmgJbYwZrLnBWkmuTXA0cDCzvpWPzntZtw4qHv4N1DECSs+m8u3VQkpuTvKK9W5AkSZKk8U35Nu1VtRp48ih1K4GVw8qOGHZ+2LD2I67rq6olkwpUkiRJkiZpq7yDJUmSJEkzgQmWJEmSJLXEBEuSJEmSWjLl72BNZwv2mcfgyUf3OwxJkiRJ2wlnsCRJkiSpJSZYkiRJktQSEyxJkiRJasmMfgdr7YaNDJx4fr/D0DSw3nfxJEmS1AJnsCRJkiSpJSZYkiRJktQSEyxJkiRJaknf38FK8lDg4uZ0b2AzcFtz/mngT4CfAfcB/1hVn0iyEpgP3Nu0e2dVfb4ZbxYwCGyoqudslZuQJEmSJKZBglVVdwALAZIsB+6pqvcleRXwfODxVXVXkl2a8yHHVdXgCEO+DrgO2GVKA5ckSZKkYabzEsG3AX9eVXcBVNVdVXXWWB2SPBw4GvjoVohPkiRJkn5N32ewRtLMVu1cVd8do9mnkgwtETyymQk7BXgzsPMUhyhJkiRJv2E6z2CN57iqWth87kjyHODWqlo9Vqcky5IMJhncvGnjVgpVkiRJ0kwwLROsZlngPUn2n0C33wX+KMl6YAXw9CT/OsLYp1fVoqpaNGvOvHYCliRJkiSmaYLVeBfwoWa5IEnmJnn5aI2r6q1V9fCqGgAWA5dU1Uu3TqiSJEmSNE3fwWp8GJgLXJHkPjrbtL+/vyFJkiRJ0uimVYJVVcu7jgt4T/MZ3u6IccZZCaxsNThJkiRJGsd0XiIoSZIkSdsUEyxJkiRJasm0WiK4tS3YZx6DJx/d7zAkSZIkbSecwZIkSZKklphgSZIkSVJLTLAkSZIkqSUz+h2stRs2MnDi+f0OQ9PAet/FkyRJUgucwZIkSZKklphgSZIkSVJLTLAkSZIkqSUmWJIkSZLUkp4TrCR7J1mR5MYkq5NckOSRTd0JSX6WZN6wPs9OMpjk2iRXJnl/U748yaYke3a1vWdY3+clqSSPGlZ+YZI7k5w3Qoy7J7kvyat6vS9JkiRJaktPCVaSAOcCK6vqgKo6DHgrsFfTZAlwBfCCrj6HAKcBL62qg4FFwHe6hr0deMMYl10CXNb87PZe4GWj9HkR8LUR+kiSJEnSlOt1ButpwH1V9ZGhgqq6qqpWJTkAmAu8g19PbN4MnFRV1zftN1fVh7vqzwCOTbLb8IslmQs8BXgFsLi7rqouBu4eJc4ldJK2fZI8vMd7kyRJkqRW9JpgHQKsHqVuMbACWAUclGSvHvoA3EMnyXrdCHXPBS6sqm8DdyQ5bLwAk+wLzK+qbwCfBY4dpd2yZtni4OZNG8cbVpIkSZJ61sYmF0uAFVV1P3AOnWV6vToVWJpk55HGbI5X0NuSv2PpJFZj9qmq06tqUVUtmjVn3khNJEmSJGmL7NBju3XAMcMLkywADgQu6rymxY7ATXTevVoHHAZcNdqgVXVnkk8Dr+4aczfg6cCCJAXMAirJm6qqxohxCbB3kuOa84clObCqbujxHiVJkiRpUnqdwboEmJ1k2VBBkkPpzEAtr6qB5vMwOonNfnQ2o3hb106DDxhld78PAK/kV8neMcAnq2q/Zsx96SRth48WXHONuVW1z1AswLtwswtJkiRJW1FPCVYzc/R84Khmm/Z1dBKYI+jsLtjtXGBxVV0NnACcneQ64Bpg/xHGvr3pM7spWjLCmOc05SRZBXwOODLJzUmeOV4fSZIkSdoaMvaqu+3b7PkH1vylp/Q7DE0D608+ut8hSJIkaRpKsrqqFvXavo1NLiRJkiRJmGBJkiRJUmt63UVwu7Rgn3kMujRMkiRJUkucwZIkSZKklphgSZIkSVJLTLAkSZIkqSUz+h2stRs2MnDi+f0OQ9OI27VLkiRpMpzBkiRJkqSWmGBJkiRJUktMsCRJkiSpJSZYkiRJktSSVhKsJHsnWZHkxiSrk3w1yaYka5L8OMlNzfFXkgwkqSSv7ep/WpLjm+Mzk2xIMrs53z3J+uZ4IMm9zVhXJfl/SQ5q6h7aXPeeJKe1cV+SJEmSNBGTTrCSBDgXWFlVB1TVYcAJwDOraiHwJeBNVbWwqo5qut0KvC7JjqMMuxn4k1HqbmzGegxwFvC2pvxnwF8Bb5zsPUmSJEnSlmhjButpwH1V9ZGhgqq6qqpWjdHnNuBiYOko9acAr08y3jbyuwA/aa7506q6jE6iJUmSJElbXRt/B+sQYPUW9Hs38OUkZ4xQ933gMuBlwL8PqzsgyRpgZ2AO8ISJXDTJMmAZwKxd9phgyJIkSZI0ur5tclFV3wW+DrxklCbvAt7Eb8Y4tETwADpLEU+f4HVPr6pFVbVo1px5E4xakiRJkkbXRoK1DjhsC/v+PfAWIMMrquoGYA3w4jH6fwl46hZeW5IkSZJa1UaCdQkwu1l6B0CSQ5McPl7HqroeuBb4w1GanMTYm1Y8BbhxArFKkiRJ0pSZ9DtYVVVJng+ckuQtdDaZWE9n+V4vTgKuHGXsdUm+CTyuq3joHawAvwD+dKii2c59F2DHJM8DnlFV107gdiRJkiRpi7WxyQVV9QNGWcpXVccPO19PZ2OMofOr6JpJG6H9C4b13WmMOAYmELYkSZIktapvm1xIkiRJ0vbGBEuSJEmSWtLKEsFt1YJ95jF48tH9DkOSJEnSdsIZLEmSJElqiQmWJEmSJLXEBEuSJEmSWjKj38Fau2EjAyee3+8wNI2s9508SZIkTYIzWJIkSZLUEhMsSZIkSWqJCZYkSZIktcQES5IkSZJaMqUJVpKHJ/likhuS3JjkH5LsmOSIJBuTrElydZKvJNmz6XNQkpVN3XVJTu8a7/FJLk3yrSRXJvlokjlJjk9yf5JDu9pek2RgKu9PkiRJkrpNWYKVJMAXgH+rqgOBRwJzgZOaJquqamFVHQpcAby6KT8V+GBT92jgH5vx9gI+B7ylqg6qqscCFwI7N/1uBt4+VfcjSZIkSeOZyhmspwM/q6qPA1TVZuD1wJ8Ac4YaNYnYzsBPmqL5dJIlmn5rm8NXA2dV1eVddZ+vqh81p+cBv5PkoKm5HUmSJEka21QmWL8DrO4uqKq7gO8Dvw0cnmRNc34UcEbT7IPAJUm+nOT1SXZtyg8ZPt4w9wPvAd42VlBJliUZTDK4edPGid2RJEmSJI2hn5tcDC0R3Bf4OJ3kiGbG69F0lgMeAXwtyewex/w08MQkjxitQVWdXlWLqmrRrDnzJnUDkiRJktRtKhOsa4HDuguS7AL8FvCdYW2/BDx16KSqflBVZ1TVc4Ff0pm9Wjd8vOGq6pfA+4G3TDp6SZIkSZqgqUywLgbmJHk5QJJZdJKfM4FNw9o+BbixafesJA9sjvcGHgpsAE4DliZ5wlCnJC9oNr/odiadJYd7tHw/kiRJkjSmKUuwqqqA5wMvSnID8G3gZ/zqHanDm63YrwJeBryhKX8GcE1T/h/Am6rqlmYzi8XA+5pt2q8DngncPey6v6CzE+GeU3VvkiRJkjSSdPKgmWn2/ANr/tJT+h2GppH1Jx/d7xAkSZI0jSRZXVWLem3fz00uJEmSJGm7YoIlSZIkSS3Zod8B9NOCfeYx6JIwSZIkSS1xBkuSJEmSWmKCJUmSJEktMcGSJEmSpJbM6Hew1m7YyMCJ5/c7DG0j3MJdkiRJ43EGS5IkSZJaYoIlSZIkSS0xwZIkSZKklphgSZIkSVJLWkmwkuydZEWSG5OsTnJBkkc2dSck+VmSecP6PDvJYJJrk1yZ5P1N+fIkm5Ls2dX2nq7jzUnWJLkqyTeTPHnYuLskuTnJaW3cmyRJkiT1atIJVpIA5wIrq+qAqjoMeCuwV9NkCXAF8IKuPocApwEvraqDgUXAd7qGvR14wyiXvLeqFlbVY5rrvGtY/d8Bl07uriRJkiRp4tqYwXoacF9VfWSooKquqqpVSQ4A5gLvoJNoDXkzcFJVXd+031xVH+6qPwM4Nslu41x7F+AnQydJDqOT2P3nZG5IkiRJkrZEGwnWIcDqUeoWAyuAVcBBSfbqoQ/APXSSrNeNULdTs0TweuCjdGasSPIA4P3AG8cKNsmyZmni4OZNG8dqKkmSJEkTMtWbXCwBVlTV/cA5wIsm0PdUYGmSnYeVDy0RfBTwLOATzTLFvwAuqKqbxxq0qk6vqkVVtWjWnHljNZUkSZKkCdmhhTHWAccML0yyADgQuKiT/7AjcBOdd6/WAYcBV402aFXdmeTTwKvHaHN5kt2BPYAnAYcn+Qs6yxJ3THJPVZ24pTcmSZIkSRPRxgzWJcDsJMuGCpIcSmcGanlVDTSfhwEPS7If8F7gbV07DT4gyatGGPsDwCsZJRFM8ihgFnBHVR1XVb9VVQN0lgl+wuRKkiRJ0tY06QSrqgp4PnBUs037Ojo7+x1BZ3fBbucCi6vqauAE4Owk1wHXAPuPMPbtTZ/ZXcVD72CtAT4DLK2qzZO9D0mSJEmarHTyo5lp9vwDa/7SU/odhrYR608+ut8hSJIkaStLsrqqFvXafqo3uZAkSZKkGcMES5IkSZJa0sYugtusBfvMY9BlX5IkSZJa4gyWJEmSJLXEBEuSJEmSWmKCJUmSJEktmdHvYK3dsJGBE8/vdxjahrl1uyRJkro5gyVJkiRJLTHBkiRJkqSWmGBJkiRJUktMsCRJkiSpJdMiwUqyOcmaJNck+VySOU35PSO0XZ5kQ9N+6LNrU/f4JJcm+VaSK5N8dGgsSZIkSZpq0yLBAu6tqoVVdQjwC+BV47T/YNN+6HNnkr2AzwFvqaqDquqxwIXAzlMcuyRJkiQB03Ob9lXAoVvQ79XAWVV1+VBBVX2+tagkSZIkaRzTZQYLgCQ7AM8G1o7T9PVdywO/2pQdAqzu4RrLkgwmGdy8aeMkI5YkSZKkX5kuM1g7JVnTHK8CPjZO+w9W1fu25EJVdTpwOsDs+QfWlowhSZIkSSOZLgnWvVW1cJJjrAMOA744+XAkSZIkaeKm1RLBSToNWJrkCUMFSV7QbH4hSZIkSVNuusxgjWZOkpu7zj/Q/Hx9kpd2lT+vqtYnWQy8L8mewP3ApXR2EpQkSZKkKTctEqyqmjtK+WgzbMtHaX85cHhLYUmSJEnShGxPSwQlSZIkqa9MsCRJkiSpJdNiiWC/LNhnHoMnH93vMCRJkiRtJ5zBkiRJkqSWmGBJkiRJUktMsCRJkiSpJTP6Hay1GzYycOL5/Q5DmpT1vkcoSZI0bTiDJUmSJEktMcGSJEmSpJaYYEmSJElSS0ywJEmSJKklk06wkmxOsibJNUk+l2ROU753khVJbkyyOskFSR6ZZCDJvUmuTHJdkm8kOb5rvOOTnDbsGiuTLGqOT0ryP0nuGdbm+CS3NbGsSfKnk703SZIkSZqINmaw7q2qhVV1CPAL4FVJApwLrKyqA6rqMOCtwF5Nnxur6rFV9WhgMXBCkj/u8Xr/Djx+lLrPNLEsrKqPbvktSZIkSdLEtb1EcBXw28DTgPuq6iNDFVV1VVWtGt6hqr4L/B/gL3u5QFV9rap+2FK8kiRJktSa1hKsJDsAzwbWAocAqyfQ/ZvAo7rOj+1a6rcGWNTjOC9McnWSzyfZd5Q4lyUZTDK4edPGCYQoSZIkSWNrI8HaqUmCBoHvAx/bgjEy7Lx7qd/CZuzx/DswUFWHAhcBZ43UqKpOr6pFVbVo1px5WxCqJEmSJI1shxbGuLdJgv5XknXAMRMY47HAdZMJoqru6Dr9KPCeyYwnSZIkSRM1Vdu0XwLMTrJsqCDJoUkOH94wyQDwPuAfJ3PBJPO7Tv+ISSZskiRJkjRRU5JgVVUBzweOarZpXwe8C7ilaXLA0DbtwGeBU6vq472MneQ9SW4G5iS5Ocnypuovk6xLchWdDTOOb/GWJEmSJGlc6eRCM9Ps+QfW/KWn9DsMaVLWn3x0v0OQJEnabiVZXVW9bro3ZUsEJUmSJGnGMcGSJEmSpJa0sYvgNmvBPvMYdHmVJEmSpJY4gyVJkiRJLTHBkiRJkqSWmGBJkiRJUktm9DtYazdsZODE8/sdhtQKt2uXJEnqP2ewJEmSJKklJliSJEmS1BITLEmSJElqiQmWJEmSJLWk5wQryd5JViS5McnqJBckeWRTd0KSnyWZN6zPs5MMJrk2yZVJ3t+UL0+yKcmeXW3vGdb3eUkqyaOGlV+Y5M4k5w0rf2CSk5PckOSbSS5P8uzevwpJkiRJmpyeEqwkAc4FVlbVAVV1GPBWYK+myRLgCuAFXX0OAU4DXlpVBwOLgO90DXs78IYxLrsEuKz52e29wMtGaP93wHzgkKp6HPA8YOde7k+SJEmS2tDrDNbTgPuq6iNDBVV1VVWtSnIAMBd4B7+eDL0ZOKmqrm/ab66qD3fVnwEcm2S34RdLMhd4CvAKYHF3XVVdDNw9rP0c4M+A11bVz5t2P6qqz/Z4f5IkSZI0ab0mWIcAq0epWwysAFYBByXZq4c+APfQSbJeN0Ldc4ELq+rbwB1JDhsnvt8Gvl9Vd43TjiTLmmWLg5s3bRyvuSRJkiT1rI1NLpYAK6rqfuAc4EUT6HsqsDTJ8KV8S+gkbTQ/hy8T3GJVdXpVLaqqRbPmzBu/gyRJkiT1aIce260DjhlemGQBcCBwUec1LXYEbqLz7tU64DDgqtEGrao7k3waeHXXmLsBTwcWJClgFlBJ3lRVNcpQ3wF+K8kuvcxiSZIkSdJU6HUG6xJgdpJlQwVJDqUzA7W8qgaaz8OAhyXZj85mFG/r2mnwAUleNcLYHwBeya+SvWOAT1bVfs2Y+9JJ2g4fLbiq2gR8DPiHJDs219sjyURm0yRJkiRpUnpKsJqZo+cDRzXbtK8D3gUcQWd3wW7nAour6mrgBODsJNcB1wD7jzD27U2f2U3RkhHGPKcpJ8kq4HPAkUluTvLMps07gNuAa5NcA5wHOJslSZIkaavJ6Kvutn+z5x9Y85ee0u8wpFasP/nofocgSZK03UmyuqoW9dq+jU0uJEmSJEmYYEmSJElSa3rdRXC7tGCfeQy6rEqSJElSS5zBkiRJkqSWmGBJkiRJUktMsCRJkiSpJTP6Hay1GzYycOL5/Q5DmlJu3y5JkrT1OIMlSZIkSS0xwZIkSZKklphgSZIkSVJLTLAkSZIkqSXTMsFKUkne33X+xiTLm+PlSTYkWdP12TXJEUk2NudXJ/lKkj37dhOSJEmSZpxpmWABPwdekGT3Ueo/WFULuz53NuWrmvNDgSuAV2+NYCVJkiQJpm+C9UvgdOD1W9I5SYCdgZ+0GZQkSZIkjWW6JlgAHwKOSzJvhLrXdy0P/GpX+eFJ1gDfB44CzhjeMcmyJINJBjdv2jglgUuSJEmamaZtglVVdwGfAP5yhOruJYJP6yofWiK4L/Bx4D0jjHt6VS2qqkWz5oyUu0mSJEnSlpm2CVbjFOAVwIO3oO+XgKe2Go0kSZIkjWFaJ1hV9WPgs3SSrIl6CnBjuxFJkiRJ0uh26HcAPXg/8JphZa9P8tKu8+c1P4fewQqwEfjTKY9OkiRJkhrTMsGqqrldxz8C5nSdLweWj9BtPeBLVZIkSZL6ZlovEZQkSZKkbYkJliRJkiS1ZFouEdxaFuwzj8GTj+53GJIkSZK2E85gSZIkSVJLTLAkSZIkqSUmWJIkSZLUkhn9DtbaDRsZOPH8fochTbn1vmsoSZK0VTiDJUmSJEktMcGSJEmSpJaYYEmSJElSSyadYCXZO8mKJDcmWZ3kq0k2JVmT5MdJbmqOv5JkIEkleW1X/9OSHN8cn5lkQ5LZzfnuSdY3xwNJ7k1yZZLrknxjqF9T/6gklyf5eZI3Tva+JEmSJGmiJrXJRZIA5wJnVdXipuwxwC5VtSrJmcB5VfX5pm4AuBV4XZJ/rqpfjDDsZuBPgA+PUHdjVT22GWt/4AtJUlUfB34M/CXwvMnckyRJkiRtqcnOYD0NuK+qPjJUUFVXVdWqMfrcBlwMLB2l/hTg9UnGTP6q6rvA/6GTVFFVt1bVFcB9vYcvSZIkSe2ZbIJ1CLB6C/q9G3hjklkj1H0fuAx4WQ/jfBN41BZcX5IkSZJa15dNLprZp68DLxmlybuANzF+fJnotZMsSzKYZHDzpo0T7S5JkiRJo5psgrUOOGwL+/498BZGSJKq6gZgDfDiccZ4LHDdRC5aVadX1aKqWjRrzryJdJUkSZKkMU02wboEmJ1k2VBBkkOTHD5ex6q6HrgW+MNRmpwEjLobYLNhxvuAf5xIwJIkSZI0VSa1i2BVVZLnA6ckeQvwM2A9cEKPQ5wEXDnK2OuSfBN4XFfxAUmuBB4E3A2cWlVnQme7eGAQ2AW4P8kJwMFVddcEb0uSJEmStsikEiyAqvoBoyzlq6rjh52vp7MxxtD5VXTNoo3Q/gXD+u40Rhy3AA+fQOiSJEmS1Kq+bHIhSZIkSdsjEyxJkiRJaokJliRJkiS1ZNLvYG3LFuwzj8GTj+53GJIkSZK2E85gSZIkSVJLTLAkSZIkqSUmWJIkSZLUkhn9DtbaDRsZOPH8fochTRvrfSdRkiRpUpzBkiRJkqSWmGBJkiRJUktMsCRJkiSpJT0lWEn2TrIiyY1JVie5IMkjm7oTkvwsybxhfZ6dZDDJtUmuTPL+pnx5kk1J9uxqe8+wvs9LUkkeNaz8wiR3JjlvWPkOSf4+yQ1J1jSft0/sq5AkSZKkyRk3wUoS4FxgZVUdUFWHAW8F9mqaLAGuAF7Q1ecQ4DTgpVV1MLAI+E7XsLcDbxjjskuAy5qf3d4LvGyE9u8EHgYsqKqFwOHAA8e7N0mSJElqUy8zWE8D7quqjwwVVNVVVbUqyQHAXOAd/Hoy9GbgpKq6vmm/uao+3FV/BnBskt2GXyzJXOApwCuAxd11VXUxcPew9nOAPwNeW1U/a9rdXVXLe7g3SZIkSWpNLwnWIcDqUeoWAyuAVcBBSfbqoQ/APXSSrNeNUPdc4MKq+jZwR5LDxonvt4HvV9Xd47STJEmSpCk12U0ulgArqup+4BzgRRPoeyqwNMnOI43ZHK/gN5cJjinJHzfvYP1Pkn1HqF/WvBs2uHnTxokMLUmSJElj6uUPDa8DjhlemGQBcCBwUec1LXYEbqLz7tU64DDgqtEGrao7k3waeHXXmLsBTwcWJClgFlBJ3lRVNcpQ3wF+K8nOzdLAjwMfT3JN03/4dU8HTgeYPf/A0caUJEmSpAnrZQbrEmB2kmVDBUkOpTMDtbyqBprPw4CHJdmPzmYUb+vaafABSV41wtgfAF7JrxK9Y4BPVtV+zZj70knaDh8tuKraBHwMOC3Jg5rrzaKT8EmSJEnSVjNugtXMHD0fOKrZpn0d8C7gCDq7C3Y7F1hcVVcDJwBnJ7kOuAbYf4Sxb2/6zG6Kloww5jlNOUlWAZ8Djkxyc5JnNm3eDvwQuCbJlXTeCTsL+MF49ydJkiRJbcnoK++2f7PnH1jzl57S7zCkaWP9yUf3OwRJkqRpJcnqqlrUa/vJbnIhSZIkSWqYYEmSJElSS0ywJEmSJKklvWzTvt1asM88Bn3nRJIkSVJLnMGSJEmSpJaYYEmSJElSS0ywJEmSJKklM/odrLUbNjJw4vn9DkPapvi3siRJkkbnDJYkSZIktcQES5IkSZJaYoIlSZIkSS2ZVIKV5J6u4z9I8u0k+yXZO8mKJDcmWZ3kgiSPTDKQpJK8tqvfaUmOb47PTLIpyc5d9ac0fXbvKnteU/aoYfFcmOTOJOdN5r4kSZIkaUu0MoOV5EjgVODZwPeBc4GVVXVAVR0GvBXYq2l+K/C6JDuOMtx3gOc24z4AeDqwYVibJcBlzc9u7wVeNrm7kSRJkqQtM+kEK8lTgX8BnlNVNwJPA+6rqo8Mtamqq6pqVXN6G3AxsHSUIVcAxzbHRwD/Dfyy63pzgacArwAWd3esqouBuyd5S5IkSZK0RSabYM0G/g14XlVd35QdAqwep9+7gTcmmTVC3beBPZI8hM4M1Yph9c8FLqyqbwN3JDlsS4OXJEmSpDZNNsG6D/h/dGaTelZV3wW+DrxklCZfoDM79QRg1bC67qRrBb+5THBMSZYlGUwyuHnTxol0lSRJkqQxTTbBuh94MfD4JG9rytYBvcwq/T3wFiAj1H0G+Dvgoqq6f6gwyW503sn6aJL1wJuAFycZaYwRVdXpVbWoqhbNmjOv126SJEmSNK5Jv4NVVZuAo4HjkrwCuASYnWTZUJskhyY5fFi/64FrgT8cYczvAW8H/mlY1THAJ6tqv6oaqKp9gZuAw4ePIUmSJElbWyu7CFbVj4FnAe+gkzA9Hziq2aZ9HfAu4JYRup4EPHyUMf+52TSj2xI6OxR2O6cpJ8kq4HPAkUluTvLMLbwlSZIkSZqwHSbTuarmdh3/D/CIruoXj9LtkK4+V9GV5FXV8aNcZ6A5fNoIdad2HTuTJUmSJKlvWpnBkiRJkiSZYEmSJElSa0ywJEmSJKklk3oHa1u3YJ95DJ58dL/DkCRJkrSdcAZLkiRJklpigiVJkiRJLTHBkiRJkqSWzOh3sNZu2MjAief3OwxpxlrvO5CSJGk74wyWJEmSJLXEBEuSJEmSWmKCJUmSJEkt2aIEK8k9w86PT3Jac7w8yYYka7o+uyY5Isl5I4y1MsmiYWVHJNnY9L0uyd901S1MUkmeNazP5qb9NUn+PcmuW3JvkiRJkrSlpmoG64NVtbDrc+cWjLGqqhYCi4CXJnlcU74EuKz52e3e5lqHAD8GXr2FsUuSJEnSFpn2SwSr6qfAauC3kwR4EXA88PtJHjRKt8uBfbZOhJIkSZLUsaUJ1k7dSwCB/zus/vVd9V+dTIBJHgo8EVgHPBm4qapuBFYCv7HHc5JZwJHAlyZzXUmSJEmaqC39O1j3Nsv3gM47WHSW8g35YFW9bxJxARye5ErgfuDkqlrXvOe1oqlfAbwcOKc536lJ9vYBrgMuGmnQJMuAZQCzdtljkiFKkiRJ0q9M5z80vKqqnjN00sxMvRB4bpK3AwEemmTnqrqbJulLMgf4DzrvYJ06fNCqOh04HWD2/ANrK9yHJEmSpBli2r+D1eVI4Oqq2reqBqpqPzqzV8/vblRVm4C/BN6QZDonkJIkSZK2M1OVYHW/g7UmyUBTfmSSm7s+T2rKz+8q+9woYy4Bzh1Wdg6/uZsgVXUlcPVIdZIkSZI0VVI1c1fJzZ5/YM1fekq/w5BmrPUn/8Y+NZIkSdNKktVVtWj8lh3b0hJBSZIkSZrWTLAkSZIkqSUmWJIkSZLUkhm9y96CfeYx6DsgkiRJklriDJYkSZIktcQES5IkSZJaYoIlSZIkSS2Z0e9grd2wkYETz+93GJLUKv++mCRJ/eMMliRJkiS1xARLkiRJklpigiVJkiRJLWklwUpSSf6163yHJLclOW9Yu39L8rVhZcuTbEiyJsk1Sf6oq/yNzfGDklyUZHlzfkaSW5NcM2ys3Zp2NzQ/H9LG/UmSJElSL9qawfopcEiSnZrz3wc2dDdIsitwGDAvyf7D+n+wqhYCLwLOSPKArn47AucAq6tqeVN8JvCsEeI4Ebi4qg4ELm7OJUmSJGmraHOJ4AXA0NZVS4Czh9W/APh3YAWweKQBquo64JfA7k3RDsBngBuq6sSudpcCPx5hiOcCZzXHZwHPm+hNSJIkSdKWajPBWgEsTvIg4FDg68Pqh5Kus5vj35DkCcD9wG1N0ZuBX1TVCT3GsFdV/bA5vgXYq+foJUmSJGmSWkuwqupqYIBO8nRBd12SvYADgcuq6tvAfUkO6Wry+iRrgPcBx1ZVNeWXAU9O8sgtiKeAGl6eZFmSwSSDmzdtnOiwkiRJkjSqtncR/BKdJGn48sAXAw8Bbkqynl8lYkM+WFULq+rwqlrVVX4pcALw5STze7j+j4baNT9vHd6gqk6vqkVVtWjWnHm93ZUkSZIk9aDtBOsM4G+rau2w8iXAs6pqoKoG6Gx2MeJ7WMNV1Tl0krYLm40yxvIlYGlzvBT4Yo9xS5IkSdKktZpgVdXNVXVqd1mSAWA/4Gtd7W4CNjbvXPUy7oeBc4EvNVu2nw1cDhyU5OYkr2iangz8fpIbgKOac0mSJEnaKvKr151mntnzD6z5S0/pdxiS1Kr1Jx89fiNJktSTJKuralGv7dteIihJkiRJM5YJliRJkiS1xARLkiRJklqyQ78D6KcF+8xj0HcVJEmSJLXEGSxJkiRJaokJliRJkiS1xARLkiRJkloyo9/BWrthIwMnnt/vMCRJkiSxffwtR2ewJEmSJKklJliSJEmS1BITLEmSJElqybRMsJI8NMma5nNLkg1JNjfn1yb5cZKbmvOvJBlIUkle2zXGaUmO7+NtSJIkSZphpuUmF1V1B7AQIMly4J6qet9QfZIzgfOq6vPN+QBwK/C6JP9cVb/YyiFLkiRJ0vScwdpCtwEXA0v7HYgkSZKkmWl7SrAA3g28McmsfgciSZIkaebZrhKsqvou8HXgJaO1SbIsyWCSwc2bNm694CRJkiRt97arBKvx98BbgIxUWVWnV9Wiqlo0a868rRuZJEmSpO3adpdgVdX1wLXAH/Y7FkmSJEkzy3aXYDVOAh7e7yAkSZIkzSzTcpv2blW1fISy44edrwcO6Tq/iu03eZQkSZI0TZmESJIkSVJLTLAkSZIkqSUmWJIkSZLUkmn/DtZUWrDPPAZPPrrfYUiSJEnaTjiDJUmSJEktMcGSJEmSpJaYYEmSJElSS0ywJEmSJKklJliSJEmS1BITLEmSJElqiQmWJEmSJLXEBEuSJEmSWmKCJUmSJEktMcGSJEmSpJaYYEmSJElSS0ywJEmSJKklJliSJEmS1BITLEmSJElqiQmWJEmSJLXEBEuSJEmSWmKCJUmSJEktMcGSJEmSpJaYYEmSJElSS1JV/Y6hb5LcDXyr33FownYHbu93EJoQn9m2yee2bfK5bXt8Ztsmn9u2Z0uf2X5VtUevjXfYggtsT75VVYv6HYQmJsmgz23b4jPbNvnctk0+t22Pz2zb5HPb9mytZ+YSQUmSJElqiQmWJEmSJLVkpidYp/c7AG0Rn9u2x2e2bfK5bZt8btsen9m2yee27dkqz2xGb3IhSZIkSW2a6TNYkiRJktSaGZtgJXlWkm8l+U6SE/sdz0yQ5Iwktya5pqtstyQXJbmh+fmQpjxJTm2ez9VJHtfVZ2nT/oYkS7vKD0uytulzapKMdQ2NL8m+Sb6a5Nok65K8rin3uU1jSR6U5BtJrmqe29825Y9I8vXmu/5Mkh2b8tnN+Xea+oGusd7alH8ryTO7ykf8HTraNdSbJLOSXJnkvObcZzbNJVnf/A5bk2SwKfN35DSXZNckn09yfZLrkjzJ5zZ9JTmo+Wds6HNXkhOm7TOrqhn3AWYBNwL7AzsCVwEH9zuu7f0DPBV4HHBNV9l7gBOb4xOBdzfHfwB8GQjwRODrTfluwHebnw9pjh/S1H2jaZum77PHuoafnp7ZfOBxzfHOwLeBg31u0/vTfJdzm+MHAl9vvuPPAoub8o8Af94c/wXwkeZ4MfCZ5vjg5vfjbOARze/NWWP9Dh3tGn56fnb/B/g0cN5Y36fPbPp8gPXA7sPK/B05zT/AWcCfNsc7Arv63LaNT/P77BZgv+n6zPr+JfXpwTwJ+I+u87cCb+13XDPhAwzw6wnWt4D5zfF8On+bDOCfgSXD2wFLgH/uKv/npmw+cH1X+f+2G+0afrbo+X0R+H2f27bzAeYA3wSeQOePK+7QlP/v70HgP4AnNcc7NO0y/HfjULvRfoc2fUa8hp+entXDgYuBpwPnjfV9+symz4eREyx/R07jDzAPuIlmLwKf27b1AZ4B/Pd0fmYzdYngPsD/dJ3f3JRp69urqn7YHN8C7NUcj/aMxiq/eYTysa6hCWiWID2WzmyIz22aa5aarQFuBS6iM3txZ1X9smnS/V3/7/Np6jcCD2Xiz/OhY1xD4zsFeDNwf3M+1vfpM5s+CvjPJKuTLGvK/B05vT0CuA34eDpLcj+a5MH43LYVi4Gzm+Np+cxmaoKlaag6/2ugtvVrbI+SzAXOAU6oqru663xu01NVba6qhXRmRR4PPKq/EWksSZ4D3FpVq/sdiybsKVX1OODZwKuTPLW70t+R09IOdF5Z+HBVPRb4KZ2lX//L5zY9Ne+I/hHwueF10+mZzdQEawOwb9f5w5sybX0/SjIfoPl5a1M+2jMaq/zhI5SPdQ31IMkD6SRXn6qqLzTFPrdtRFXdCXyVztKvXZPs0FR1f9f/+3ya+nnAHUz8ed4xxjU0tt8F/ijJemAFnWWC/4DPbNqrqg3Nz1uBc+n8Dw1/R05vNwM3V9XXm/PP00m4fG7T37OBb1bVj5rzafnMZmqCdQVwYDo7J+1IZ6rxS32Oaab6ErC0OV5K5x2fofKXN7vAPBHY2EzP/gfwjCQPaXZxeQad9wV+CNyV5InNri8vHzbWSNfQOJrv8mPAdVX1ga4qn9s0lmSPJLs2xzvReW/uOjqJ1jFNs+HPbei7Pga4pPm/dF8CFqezY90jgAPpvAQ84u/Qps9o19AYquqtVfXwqhqg831eUlXH4TOb1pI8OMnOQ8d0frddg78jp7WqugX4nyQHNUVHAtfic9sWLOFXywNhuj6zfr+o1q8Pnd1Fvk3nvYS39zuemfBp/oH4IXAfnf979Ao66/8vBm4AvgLs1rQN8KHm+awFFnWN8yfAd5rPH3eVL6LzL7YbgdP41R/SHvEafnp6Zk+hMxV+NbCm+fyBz216f4BDgSub53YN8NdN+f50/mP7O3SWV8xuyh/UnH+nqd+/a6y3N8/mWzQ7KjXlI/4OHe0afib0/I7gV7sI+sym8af57q5qPuuGvld/R07/D7AQGGx+T/4bnR3lfG7T+AM8mM6s+7yusmn5zIY6SpIkSZImaaYuEZQkSZKk1plgSZIkSVJLTLAkSZIkqSUmWJIkSZLUEhMsSZIkSWqJCZYkzXBJNidZ0/U5seXxP5rk4B7bHpHkvDavP2z8XZP8xUSvl+T/JjlqAtc5IsnG5vu8OslXkuy5pXH3cL2VSb6V5Kok/931931GavuwJJ+fqlgkaaYzwZIk3VtVC7s+J7c5eFX9aVVd2+aYk7Ar8BfjNRquqv66qr4ywW6rmu/zUDp/6PfVE73uBB1XVY8BzgLeO1qjqvpBVR0zvDzJDlMZnCTNFCZYkqTfkGReMyNyUHN+dpI/S8d7k1yTZG2SY5v6I5pZlM8nuT7Jp5KkqVuZZFFz/Kwk32xmWi6eQDzPSHJ50/dzSeY25euT/G1TvjbJo5ryPZJclGRdM4P2vSS7AycDBzQzS0NJyNyR4h52/TOTHDPWNceIPcDOwE+a88c393Jlkv/X9R3/TpJvdM16HdiUv7Sr/J+TzBrn67oU+O0kA0lWNXF+M8mTm/EGklzTHB+f5EtJLgEuTjI/yaXNta5Jcvj4T0eS1M0ES5K007AlgsdW1UbgNcCZSRYDD6mqfwFeACwEHgMcBbw3yfxmnMcCJwAHA/sDv9t9kSR7AP8CvLCZaXlRL8E1idE7gKOq6nHAIPB/uprc3pR/GHhjU/Y3wCVV9TvA54HfaspPBG5sZpbe1EvcoxjpmsMdnmQN8H0639UZTfn1wOFV9Vjgr4G/b8pfBfxDVS0EFgE3J3k0cCzwu035ZuC4cWL7Q2AtcCvw+02cxwKnjtL+ccAxVfV7wEuA/2iu9RhgzTjXkiQN43IASdK9zX9Q/5qquijJi4AP0fmPbYCnAGdX1WbgR0n+C/j/gLuAb1TVzQBNYjEAXNY15BOBS6vqpmb8H/cY3xPpJD//3Uwu7Qhc3lX/hebnajoJ4FCcz2+uc2GSn4wx/nhxj2Skaw63qqqe04z7FuA9dJKoecBZzQxVAQ9s2l8OvD3Jw4EvVNUNSY4EDgOuaO59JzqJ00g+leReYD3w2mbc05IspJOYPXKUfhd1PYsrgDOSPBD4t6paM+o3IEkakQmWJGlESR4APBrYBDwEuHmcLj/vOt5Me/+OCZ0kYMk4193Sa25J3BO95peAc5rjvwO+WlXPTzIArASoqk8n+TpwNHBBklfSufezquqtPVzjuKoaHDpJshz4EZ3k+AHAz0bp99Ohg6q6NMlTmxjOTPKBqvpED9eWJDVcIihJGs3rgevoLBv7eDOrsQo4NsmsZsnfU4Fv9Dje14CnJnkEQJLdJtDvd5P8dtPvwUlGm40Z8t/Ai5v2z6CTIALcTed9qK3tKcCNzfE8YENzfPxQgyT7A9+tqlOBLwKHAhcDx6TZgTDJbkn26/Ga84AfVtX9wMuA8d7dohn7R81y0I/SWT4oSZoAZ7AkSTs1S+OGXAh8HPhT4PFVdXeSS+m8B7UceBJwFZ3lbW+uqlvG2+gBoKpuS7IM+EIzO3Yr8PsjND0ySfds2YvoJCJnJ5ndlL0D+PYYl/vbpv3L6Cy9uwW4u6p+ns425tcAXwbOHy/uSRh6ByvARjrfJ3SWCp6V5B3Drv9i4GVJ7mvi/fuq+nHT7j+b7+w+OrsRfq+H6/8TcE6Sl9N5pj8dpz3AEcCbmhjuAV7eQx9JUpdUVb9jkCSpVU0itrmqfpnkScCHR3rPTJKktjmDJUnaHv0W8Nlm1ucXwJ/1OR5J0gzhDJYkSZIktcRNLiRJkiSpJSZYkiRJktQSEyxJkiRJaokJliRJkiS1xARLkiRJklpigiVJkiRJLfn/ActbLqvsbRvdAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 1008x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "if NORMALIZE_FOR_GENE_LENGTH:\n",
    "    annotation_gencode = pd.read_csv(gencode_annotation_path,\n",
    "                                 comment='#', sep='\\t', skiprows=7,header=None,\n",
    "                                 names=['chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attr']\n",
    "                                )\n",
    "    # derive length of all exons\n",
    "    annotation_gencode = annotation_gencode[annotation_gencode.type == 'exon']\n",
    "    annotation_gencode['length'] = np.abs(annotation_gencode.end - annotation_gencode.start)\n",
    "    # extract the gene name for each exon\n",
    "    def get_hugo_symbol(row):\n",
    "        s = row.attr\n",
    "        for elem in s.split(';'):\n",
    "            if elem.startswith('gene_name'):\n",
    "                return elem.split('=')[1].strip()\n",
    "        return None\n",
    "    annotation_gencode['Hugo_Symbol'] = annotation_gencode.apply(get_hugo_symbol, axis=1)\n",
    "    # add length of exons together for each gene\n",
    "    exonic_gene_lengths = annotation_gencode.groupby('Hugo_Symbol').length.sum()\n",
    "\n",
    "    # plot the longest proteins at the end\n",
    "    fig = plt.figure(figsize=(14, 8))\n",
    "    plt.barh(y = range(20), width=exonic_gene_lengths.sort_values(ascending=False).head(20), tick_label=exonic_gene_lengths.sort_values(ascending=False).head(20).index)\n",
    "    plt.xlabel('Exonic Length in Base Pairs')\n",
    "    plt.title('20 Longest Proteins')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Load CNAs from Disk and Preprocess\n",
    "In this step, we load the CNAs from TCGA per cancer type/study. We next normalize for gene length (if requested) and then average across samples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.095333275757464e-06\n",
      "Processed PRAD\n",
      "6.477790068766598e-06\n",
      "Processed LIHC\n",
      "1.0517950324668259e-05\n",
      "Processed LUSC\n",
      "9.681745411548166e-06\n",
      "Processed BLCA\n",
      "6.557614652885231e-06\n",
      "Processed CESC\n",
      "1.2031022330974295e-06\n",
      "Processed KIRP\n",
      "9.865853516213125e-06\n",
      "Processed BRCA\n",
      "7.720146798420783e-06\n",
      "Processed STAD\n",
      "6.593935821214601e-06\n",
      "Processed HNSC\n",
      "5.006320353770443e-06\n",
      "Processed READ\n",
      "6.377408905760918e-06\n",
      "Processed UCEC\n",
      "7.849189345944306e-06\n",
      "Processed LUAD\n",
      "1.7644882185580586e-07\n",
      "Processed THCA\n",
      "3.5685833097423506e-06\n",
      "Processed COAD\n",
      "1.999203630805188e-06\n",
      "Processed KIRC\n",
      "1.1196488330553018e-05\n",
      "Processed ESCA\n"
     ]
    }
   ],
   "source": [
    "def process_cna_for_ctype(fname, gene_lengths=None):\n",
    "    # load data for cancer type\n",
    "    gene_sample_matrix = pd.read_csv(fname, sep='\\t')\n",
    "    # remove the isoform from ensembl IDs\n",
    "    gene_sample_matrix.set_index(gene_sample_matrix['Gene Symbol'].str.split('.').str[0], inplace=True)\n",
    "    # drop uninformative columns\n",
    "    gene_sample_matrix.drop(['Gene Symbol', 'Gene ID', 'Cytoband'], axis=1, inplace=True)\n",
    "    if not gene_lengths is None:\n",
    "        bpmr = gene_sample_matrix.abs().sum() / float(gene_lengths.sum()) # mutation rate per base per patient (vector)\n",
    "        print (bpmr.mean())\n",
    "        expected_mutation_rates = pd.DataFrame(np.matlib.repmat(bpmr, gene_lengths.shape[0], 1) * gene_lengths.values.reshape(-1, 1),\n",
    "                                               index=gene_lengths.index,\n",
    "                                               columns=bpmr.index\n",
    "                                              ) # expected mutation frequency per gene and patient\n",
    "        denom = 1 + expected_mutation_rates.reindex_like(gene_sample_matrix).fillna(expected_mutation_rates.median(axis=0))\n",
    "        normalized_gene_sample_matrix = gene_sample_matrix.abs() / denom\n",
    "        cna_mean_matrix = normalized_gene_sample_matrix.mean(axis=1)\n",
    "    else:\n",
    "        # compute average of absolute CNAs (no cancelling out)\n",
    "        cna_mean_matrix = gene_sample_matrix.abs().mean(axis=1)\n",
    "    # get cancer type from file name\n",
    "    ctype = os.path.basename(fname).split('.')[0].strip()\n",
    "    # rename series to cancer type\n",
    "    cna_mean_matrix.rename(ctype, inplace=True)\n",
    "\n",
    "    return cna_mean_matrix, ctype\n",
    "\n",
    "cna_all_matrices = []\n",
    "cna_ctypes = []\n",
    "for dname in os.listdir(cna_base_dir):\n",
    "    ctype_dir = os.path.join(cna_base_dir, dname)\n",
    "    if os.path.isdir(ctype_dir):\n",
    "        for cna_gene_scores in os.listdir(ctype_dir):\n",
    "            if cna_gene_scores.endswith('focal_score_by_genes.txt'):\n",
    "                if NORMALIZE_FOR_GENE_LENGTH:\n",
    "                    avg_cna, ctype = process_cna_for_ctype(os.path.join(ctype_dir, cna_gene_scores),\n",
    "                                                           gene_lengths=exonic_gene_lengths)\n",
    "                else:\n",
    "                    avg_cna, ctype = process_cna_for_ctype(os.path.join(ctype_dir, cna_gene_scores), gene_lengths=None)\n",
    "                cna_all_matrices.append(avg_cna)\n",
    "                cna_ctypes.append(ctype)\n",
    "                print (\"Processed {}\".format(ctype))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Merge and Provide Gene Symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "cna_mean_matrix2 = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), cna_all_matrices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gene Symbol\n",
       "ENSG00000177694    0.195310\n",
       "ENSG00000186153    0.173605\n",
       "ENSG00000168702    0.157173\n",
       "ENSG00000145012    0.155726\n",
       "ENSG00000264545    0.154464\n",
       "                     ...   \n",
       "ENSG00000183035    0.021962\n",
       "ENSG00000153779    0.021880\n",
       "ENSG00000147138    0.021800\n",
       "ENSG00000147183    0.021416\n",
       "ENSG00000102271    0.021127\n",
       "Length: 19729, dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cna_mean_matrix2.mean(axis=1).sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Gene Symbol</th>\n",
       "      <th>ENSG00000141510</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>PRAD</th>\n",
       "      <td>0.181275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIHC</th>\n",
       "      <td>0.097625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>UCEC</th>\n",
       "      <td>0.065693</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BLCA</th>\n",
       "      <td>0.057831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BRCA</th>\n",
       "      <td>0.056962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESCA</th>\n",
       "      <td>0.054054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LUAD</th>\n",
       "      <td>0.050450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>STAD</th>\n",
       "      <td>0.045455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSC</th>\n",
       "      <td>0.041825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LUSC</th>\n",
       "      <td>0.036260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CESC</th>\n",
       "      <td>0.030303</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COAD</th>\n",
       "      <td>0.023715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>READ</th>\n",
       "      <td>0.023529</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KIRC</th>\n",
       "      <td>0.008489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KIRP</th>\n",
       "      <td>0.006601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>THCA</th>\n",
       "      <td>0.003906</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Gene Symbol  ENSG00000141510\n",
       "PRAD                0.181275\n",
       "LIHC                0.097625\n",
       "UCEC                0.065693\n",
       "BLCA                0.057831\n",
       "BRCA                0.056962\n",
       "ESCA                0.054054\n",
       "LUAD                0.050450\n",
       "STAD                0.045455\n",
       "HNSC                0.041825\n",
       "LUSC                0.036260\n",
       "CESC                0.030303\n",
       "COAD                0.023715\n",
       "READ                0.023529\n",
       "KIRC                0.008489\n",
       "KIRP                0.006601\n",
       "THCA                0.003906"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cna_mean_matrix2[cna_mean_matrix2.index == 'ENSG00000141510'].T.sort_values(by='ENSG00000141510', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gene Symbol\n",
       "ENSG00000177694    0.192954\n",
       "ENSG00000186153    0.171760\n",
       "ENSG00000168702    0.155265\n",
       "ENSG00000145012    0.153694\n",
       "ENSG00000172264    0.152814\n",
       "                     ...   \n",
       "ENSG00000183035    0.021589\n",
       "ENSG00000153779    0.021508\n",
       "ENSG00000147138    0.021429\n",
       "ENSG00000147183    0.021066\n",
       "ENSG00000102271    0.020768\n",
       "Length: 19729, dtype: float64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cna_mean_matrix[cna_mean_matrix.index == 'ENSG00000141510'].T.sort_values(by='ENSG00000141510', ascending=False)\n",
    "cna_mean_matrix.mean(axis=1).sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "querying 1-1000...done.\n",
      "querying 1001-2000...done.\n",
      "querying 2001-3000...done.\n",
      "querying 3001-4000...done.\n",
      "querying 4001-5000...done.\n",
      "querying 5001-6000...done.\n",
      "querying 6001-7000...done.\n",
      "querying 7001-8000...done.\n",
      "querying 8001-9000...done.\n",
      "querying 9001-10000...done.\n",
      "querying 10001-11000...done.\n",
      "querying 11001-12000...done.\n",
      "querying 12001-13000...done.\n",
      "querying 13001-14000...done.\n",
      "querying 14001-15000...done.\n",
      "querying 15001-16000...done.\n",
      "querying 16001-17000...done.\n",
      "querying 17001-18000...done.\n",
      "querying 18001-19000...done.\n",
      "querying 19001-19729...done.\n",
      "Finished.\n",
      "280 input query terms found no hit:\n",
      "\t['ENSG00000273547', 'ENSG00000278566', 'ENSG00000279244', 'ENSG00000237847', 'ENSG00000268991', 'ENS\n"
     ]
    }
   ],
   "source": [
    "cna_mean_matrix = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), cna_all_matrices)\n",
    "# get gene symbols\n",
    "ensembl_symbol_map = utils.get_symbols_from_ensembl(cna_mean_matrix.index)\n",
    "cna_mean_with_names = cna_mean_matrix.join(ensembl_symbol_map)\n",
    "cna_mean_with_names.dropna(axis=0, inplace=True)\n",
    "cna_mean_with_names.set_index('Symbol', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Symbol</th>\n",
       "      <th>SOX2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>LUSC</th>\n",
       "      <td>0.503304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESCA</th>\n",
       "      <td>0.314683</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSC</th>\n",
       "      <td>0.267196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CESC</th>\n",
       "      <td>0.199848</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>UCEC</th>\n",
       "      <td>0.144982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BRCA</th>\n",
       "      <td>0.135574</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>STAD</th>\n",
       "      <td>0.131980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BLCA</th>\n",
       "      <td>0.113631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LUAD</th>\n",
       "      <td>0.106456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIHC</th>\n",
       "      <td>0.075662</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>READ</th>\n",
       "      <td>0.058429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PRAD</th>\n",
       "      <td>0.045427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KIRC</th>\n",
       "      <td>0.043919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COAD</th>\n",
       "      <td>0.033183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KIRP</th>\n",
       "      <td>0.019682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>THCA</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Symbol      SOX2\n",
       "LUSC    0.503304\n",
       "ESCA    0.314683\n",
       "HNSC    0.267196\n",
       "CESC    0.199848\n",
       "UCEC    0.144982\n",
       "BRCA    0.135574\n",
       "STAD    0.131980\n",
       "BLCA    0.113631\n",
       "LUAD    0.106456\n",
       "LIHC    0.075662\n",
       "READ    0.058429\n",
       "PRAD    0.045427\n",
       "KIRC    0.043919\n",
       "COAD    0.033183\n",
       "KIRP    0.019682\n",
       "THCA    0.000000"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cna_mean_with_names[cna_mean_with_names.index == 'SOX2'].T.sort_values(by='SOX2', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Write to Disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "cna_mean_with_names = cna_mean_with_names[~cna_mean_with_names.index.duplicated(keep='first')]\n",
    "cna_mean_with_names.to_csv('../../data/pancancer/TCGA/mutation/CNA_mean_TCGA.tsv', sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
